This is the R Markdown Notebook created for the JCI analysis. Notebook and data are available at GitHub.
source('functions.R')
library(reldist)
library(gglorenz)
library(ggplot2)
library(GGally)
library(dplyr)
library(ggnewscale)
set.seed(13)
The first line can only be executed if you have downloaded the data from JCR as multiple files; otherwise, use the second line, which loads the anonymized JCR data.
# Option 1 (disabled): build the dataset from a multi-file JCR download.
#df <- read_jcr('data/files/')
# Option 2: load the anonymized JCR table. Column names are kept verbatim
# (check.names = FALSE) because they contain spaces and leading digits.
df <- read.delim(
  file = 'data/jcr.tsv',
  check.names = FALSE,
  stringsAsFactors = FALSE
)
Some descriptive statistics of the dataset.
# Five-number summary, mean and NA count of the 2020 JCI column.
summary(df[['2020 JCI']])
## Min. 1st Qu. Median Mean 3rd Qu. Max. NA's
## 0.0000 0.3400 0.6400 0.8058 1.0300 77.6400 66
Gini Index is calculated for each JCR indicator.
# Gini index of one column of `df`, computed over its non-missing values.
gini_of <- function(column) {
  values <- df[[column]]
  gini(values[!is.na(values)])
}
gini_of('2020 JIF')
## [1] 0.4444105
gini_of('2020 JCI')
## [1] 0.4365864
gini_of('5 Year JIF')
## [1] 0.4403715
gini_of('Immediacy Index')
## [1] 0.6220866
gini_of('Eigenfactor')
## [1] 0.8019052
gini_of('Total Citations')
## [1] 0.7918109
gini_of('Article Influence Score')
## [1] 0.5499661
A new data.frame is created to obtain the Lorenz curve in an easy way.
# Long-format table for the Lorenz curves: the rows of `df` are repeated once
# per indicator, with columns Cat, DB, Value and Indicator.
# Names are the source columns of `df`; values are the labels stored in
# `Indicator` (only the first three differ from the column name).
indicator_labels <- c(
  '2020 JCI' = 'Journal Citation Indicator',
  '2020 JIF' = 'Journal Impact Factor',
  '5 Year JIF' = '5-Year Journal Impact Factor',
  'Immediacy Index' = 'Immediacy Index',
  'Eigenfactor' = 'Eigenfactor',
  'Total Citations' = 'Total Citations',
  'Article Influence Score' = 'Article Influence Score'
)
indicator_parts <- lapply(names(indicator_labels), function(x) {
  aux <- df[, c('Cat', 'DB', x)]
  names(aux)[3] <- 'Value'
  aux$Indicator <- indicator_labels[[x]]
  aux
})
# Bind every piece in one call instead of growing the data.frame inside a
# loop, which re-copies all accumulated rows on each iteration.
df_den <- do.call(
  rbind.data.frame,
  c(indicator_parts, list(stringsAsFactors = FALSE))
)
The Lorenz curve is obtained.
# Lorenz curve of every indicator (one colour each) plus the geom_abline()
# reference diagonal.
ggplot(df_den, aes(x = Value, color = Indicator)) +
  stat_lorenz(desc = FALSE, alpha = 0.7, size = 1) +
  geom_abline(size = 0.8) +
  scale_color_manual(
    values = c('#fb0007', '#129176', '#e5c421', '#f46f08', '#4cacd0', '#291b37', '#de8bb7')
  ) +
  theme_light() +
  theme(
    panel.grid = element_blank(),
    legend.title = element_blank(),
    text = element_text(family = 'Arial', size = 12.5, color = 'black'),
    axis.text = element_text(color = 'black', size = 12),
    axis.ticks = element_line(color = 'black'),
    legend.position = 'bottom',
    panel.border = element_rect(colour = 'black'),
    strip.background = element_rect(colour = 'black', fill = 'black'),
    strip.text = element_text(size = 14),
    legend.text = element_text(size = 10)
  ) +
  guides(
    fill = guide_colorbar(
      label.position = 'bottom',
      title.position = 'left',
      title.vjust = 0.85
    )
  ) +
  # Axis titles: spelling corrected from "Cummulative" to "Cumulative".
  labs(
    y = 'Cumulative percentage of indicator',
    x = 'Cumulative percentage of journals'
  )
A new data.frame is created for obtaining a matrix of plots.
# Columns shown in the pairs plot, plus the database each row belongs to.
pair_columns <- c(
  '2020 JCI', '2020 JIF', '5 Year JIF', 'Immediacy Index',
  'Eigenfactor', 'Total Citations', 'Article Influence Score', 'DB'
)
m_plot <- df[, pair_columns]
# Shorten three of the headers so they fit in the panel strips.
short_names <- c(
  '2020 JCI' = 'JCI',
  '2020 JIF' = 'JIF',
  'Article Influence Score' = 'AIS'
)
names(m_plot)[match(names(short_names), names(m_plot))] <- unname(short_names)
The matrix of plots with the correlations and distribution of data is generated. All of them are significant.
# Pairs plot of the seven indicators: Pearson correlations in the upper
# triangle, scatter plots in the lower triangle.
cor_panel <- wrap('cor', size = 5, method = 'pearson')
point_panel <- wrap('points', alpha = 0.6)
pairs_theme <- theme(
  text = element_text(family = 'Arial', size = 12.5, color = 'black'),
  axis.text = element_text(color = 'black', size = 11),
  axis.ticks = element_line(color = 'black'),
  strip.background = element_rect(colour = 'black', fill = 'black'),
  strip.text = element_text(size = 12, color = 'white'),
  legend.position = 'none'
)
ggpairs(
  m_plot,
  columns = 1:7,
  upper = list(continuous = cor_panel),
  lower = list(continuous = point_panel)
) +
  pairs_theme
Pearson correlations between the seven indicators are calculated.
# Correlation of every other indicator with the 2020 JCI. biblio_cor() comes
# from functions.R; each result is tagged with the indicator it refers to.
cor_parts <- lapply(
  c('2020 JIF', '5 Year JIF', 'Immediacy Index', 'Eigenfactor',
    'Total Citations', 'Article Influence Score'),
  function(x) {
    aux <- biblio_cor(df, x, '2020 JCI')
    aux$Indicator <- x
    aux
  }
)
# Bind all results in one call rather than growing the data.frame in a loop.
df_cor <- do.call(
  rbind.data.frame,
  c(cor_parts, list(stringsAsFactors = FALSE))
)
# Drop the rows for which no correlation value is available.
df_cor <- df_cor[which(!is.na(df_cor$Correlation)), ]
ESI fields are assigned to each Web of Science Category.
# Web of Science category -> ESI field mapping.
esi <- read.csv2(file = 'data/final_map.csv', stringsAsFactors = FALSE)
# Drop the apostrophe from "Women's Studies" and upper-case every category
# name before joining (presumably to match the spelling used in `Cat`).
esi$WC <- toupper(
  replace(esi$WC, which(esi$WC == "Women's Studies"), 'Womens Studies')
)
esi_lookup <- esi[, c('WC', 'ESI')]
df_cor <- inner_join(df_cor, esi_lookup, by = c('Cat' = 'WC'))
Firstly, correlations between JCI and JIF and 5-Year JIF are calculated.
# Correlations of the JCI with the JIF and the 5-Year JIF, SSCI and SCIE only.
is_jif_variant <- df_cor$Indicator %in% c('2020 JIF', '5 Year JIF')
is_citation_index <- df_cor$DB %in% c('SSCI', 'SCIE')
jci_jif <- df_cor[which(is_jif_variant & is_citation_index), ]
# Replace database codes and column names with the labels used in the plots.
jci_jif[jci_jif$DB %in% 'SSCI', 'DB'] <- 'Social Sciences'
jci_jif[jci_jif$DB %in% 'SCIE', 'DB'] <- 'Science'
jci_jif[jci_jif$Indicator %in% '2020 JIF', 'Indicator'] <- 'Journal Impact Factor'
jci_jif[jci_jif$Indicator %in% '5 Year JIF', 'Indicator'] <- '5-Year Journal Impact Factor'
# Ordered factor so the JIF facet is drawn before the 5-Year JIF facet.
jci_jif$Indicator <- factor(
  jci_jif$Indicator,
  levels = c('Journal Impact Factor', '5-Year Journal Impact Factor'),
  ordered = TRUE
)
There are 13 correlations with a p-value greater than 0.001 (6 in JIF and 7 in 5-Year JIF).
# Number of correlations with a p-value above 0.001 (rows, columns).
weak_rows <- which(jci_jif$p > 0.001)
dim(jci_jif[weak_rows, ])
## [1] 13 9
# Mean correlation with the JIF among the correlations with p < 0.001:
# overall, then for Science and for Social Sciences.
jif_sig <- jci_jif$p < 0.001 & jci_jif$Indicator == 'Journal Impact Factor'
mean(jci_jif$Correlation[which(jif_sig)])
## [1] 0.8923508
mean(jci_jif$Correlation[which(jif_sig & jci_jif$DB == 'Science')])
## [1] 0.9036724
mean(jci_jif$Correlation[which(jif_sig & jci_jif$DB == 'Social Sciences')])
## [1] 0.8571731
A boxplot of the correlations with a p-value below 0.001 is generated, with the correlation axis limited to the range 0.6–1.
# Boxplots (one facet per indicator, one box per database) of the correlations
# with p < 0.001. The points are drawn in two layers so that each indicator
# gets its own fill gradient (red for JIF, blue for 5-Year JIF).
box_sig <- jci_jif[which(jci_jif$p < 0.001), ]
pts_jif <- jci_jif[which(jci_jif$Indicator == 'Journal Impact Factor' & jci_jif$p < 0.001), ]
pts_jif5 <- jci_jif[which(jci_jif$Indicator == '5-Year Journal Impact Factor' & jci_jif$p < 0.001), ]
ggplot(data = box_sig, aes(y = Correlation, x = DB, group = DB)) +
  geom_boxplot(outlier.shape = NA, width = 0.8) +
  geom_jitter(
    data = pts_jif,
    aes(fill = Correlation, size = Docs),
    shape = 21, color = 'black', stroke = 0.5, alpha = 0.6
  ) +
  scale_fill_gradient(
    low = 'white', high = '#eb2701', na.value = NA,
    breaks = c(0.6, 0.8, 1), limits = c(0.6, 1)
  ) +
  new_scale_fill() +
  geom_jitter(
    data = pts_jif5,
    aes(fill = Correlation, size = Docs),
    shape = 21, color = 'black', stroke = 0.5, alpha = 0.6
  ) +
  scale_fill_gradient(
    low = 'white', high = 'blue', na.value = NA,
    breaks = c(0.6, 0.8, 1), limits = c(0.6, 1)
  ) +
  ylim(c(0.6, 1)) +
  scale_size_continuous(
    range = c(1, 5),
    breaks = c(0, 25000, 50000, 100000, 125000, 150000, 200000)
  ) +
  theme_light() +
  theme(
    panel.grid = element_blank(),
    text = element_text(family = 'Arial', size = 12.5, color = 'black'),
    axis.text = element_text(color = 'black', size = 11),
    axis.ticks = element_line(color = 'black'),
    legend.position = 'none',
    legend.box = 'vertical',
    panel.border = element_rect(colour = 'black'),
    strip.background = element_rect(colour = 'black', fill = 'black'),
    strip.text = element_text(size = 14)
  ) +
  guides(
    fill = guide_colorbar(
      label.position = 'bottom',
      title.position = 'left',
      title.vjust = 0.85
    )
  ) +
  labs(x = '', y = 'Correlation with JCI') +
  facet_wrap(~Indicator)
The complete boxplot does not vary much, only 7 more categories are included.
# Same boxplot without the p-value filter: every row of jci_jif is drawn.
# The two point layers keep separate fill gradients (red JIF, blue 5-Year JIF).
pts_jif_all <- jci_jif[which(jci_jif$Indicator == 'Journal Impact Factor'), ]
pts_jif5_all <- jci_jif[which(jci_jif$Indicator == '5-Year Journal Impact Factor'), ]
ggplot(data = jci_jif, aes(y = Correlation, x = DB, group = DB)) +
  geom_boxplot(outlier.shape = NA, width = 0.8) +
  geom_jitter(
    data = pts_jif_all,
    aes(fill = Correlation, size = Docs),
    shape = 21, color = 'black', stroke = 0.5, alpha = 0.6
  ) +
  scale_fill_gradient(
    low = 'white', high = '#eb2701', na.value = NA,
    breaks = c(0.5, 0.75, 1), limits = c(0.5, 1)
  ) +
  new_scale_fill() +
  geom_jitter(
    data = pts_jif5_all,
    aes(fill = Correlation, size = Docs),
    shape = 21, color = 'black', stroke = 0.5, alpha = 0.6
  ) +
  scale_fill_gradient(
    low = 'white', high = 'blue', na.value = NA,
    breaks = c(0, 0.5, 1), limits = c(0, 1)
  ) +
  scale_size_continuous(
    range = c(1, 5),
    breaks = c(0, 25000, 50000, 100000, 125000, 150000, 200000)
  ) +
  theme_light() +
  theme(
    panel.grid = element_blank(),
    text = element_text(family = 'Arial', size = 12.5, color = 'black'),
    axis.text = element_text(color = 'black', size = 11),
    axis.ticks = element_line(color = 'black'),
    legend.position = 'none',
    legend.box = 'vertical',
    panel.border = element_rect(colour = 'black'),
    strip.background = element_rect(colour = 'black', fill = 'black'),
    strip.text = element_text(size = 14)
  ) +
  guides(
    fill = guide_colorbar(
      label.position = 'bottom',
      title.position = 'left',
      title.vjust = 0.85
    )
  ) +
  labs(x = '') +
  facet_wrap(~Indicator)
The correlations between the JCI and the rest of the indicators are calculated, but taking into account only those categories with more than 25 journals. This is because a few small categories greatly alter the results.
# Correlations with the remaining indicators (everything except JIF and 5-Year
# JIF), restricted to categories with more than 25 journals.
other_rows <- which(
  !(df_cor$Indicator %in% c('2020 JIF', '5 Year JIF')) & df_cor$Journals > 25
)
ot_indicators <- df_cor[other_rows, ]
# Replace database codes with axis labels; "\n" wraps long labels on two lines.
db_labels <- c(
  SSCI = 'Social\nSciences',
  SCIE = 'Science',
  AHCI = 'Arts &\nHumanities',
  ESCI = 'Emerging\nSources'
)
for (db_code in names(db_labels)) {
  ot_indicators[which(ot_indicators$DB == db_code), 'DB'] <- db_labels[[db_code]]
}
There are 118 correlations with a p-value of 0.01 or greater (43 in Total Citations, 33 each in Immediacy Index and Eigenfactor, and 9 in Article Influence Score).
# Count, per indicator, the correlations whose p-value is 0.01 or greater.
table(ot_indicators$Indicator[which(ot_indicators$p >= 0.01)])
##
## Article Influence Score Eigenfactor Immediacy Index
## 9 33 33
## Total Citations
## 43
Boxplots are generated.
# Boxplots of the correlation with the JCI, one facet per indicator and one
# box per database; points are filled by correlation and sized by Docs.
ot_theme <- theme(
  panel.grid = element_blank(),
  text = element_text(family = 'Arial', size = 12.5, color = 'black'),
  axis.text = element_text(color = 'black', size = 10.5),
  axis.ticks = element_line(color = 'black'),
  legend.position = 'bottom',
  panel.border = element_rect(colour = 'black'),
  strip.background = element_rect(colour = 'black', fill = 'black'),
  strip.text = element_text(size = 14)
)
ggplot(data = ot_indicators, aes(y = Correlation, x = DB, group = DB)) +
  geom_boxplot(outlier.shape = NA, width = 0.8) +
  geom_jitter(
    aes(fill = Correlation, size = Docs),
    shape = 21, color = 'black', stroke = 0.5, alpha = 0.6
  ) +
  scale_fill_gradient(
    low = 'white', high = '#eb2701', na.value = NA,
    breaks = c(0, 0.5, 1), limits = c(0, 1)
  ) +
  scale_size_continuous(
    range = c(0.1, 3.5),
    breaks = c(0, 25000, 50000, 100000, 125000, 150000, 200000)
  ) +
  theme_light() +
  ot_theme +
  guides(
    fill = guide_colorbar(
      label.position = 'bottom',
      title.position = 'left',
      title.vjust = 0.85
    )
  ) +
  labs(x = '') +
  facet_wrap(~Indicator)
Firstly, we rank JIF and JCI values by categories.
# Keep only the rows that report both the 2020 JIF and the 2020 JCI.
df_rank <- df[which(!is.na(df$`2020 JIF`) & !is.na(df$`2020 JCI`)), ]
length(unique(df_rank$Category))
## [1] 236
df_rank$quantile_jif <- NA
df_rank$quantile_jci <- NA
# Descending rank inside each category (1 = highest value). Ties are broken at
# random, so the category order and the JIF-then-JCI call order determine the
# random draws and are kept as they are.
for (ca in unique(df$Category)) {
  in_cat <- which(df_rank$Category == ca)
  df_rank[in_cat, 'quantile_jif'] <- trunc(
    rank(-df_rank[in_cat, '2020 JIF'], ties.method = 'random')
  )
  df_rank[in_cat, 'quantile_jci'] <- trunc(
    rank(-df_rank[in_cat, '2020 JCI'], ties.method = 'random')
  )
}
A linear regression model is generated with JIF and JCI ranks.
# Pooled linear model of the JIF rank on the JCI rank, fitted on the SSCI and
# SCIE rows of df_rank. The data subset is written inline because summary()
# echoes the call verbatim in its printed output.
lm_g <- lm(quantile_jif~quantile_jci, df_rank[which(df_rank$DB %in% c('SSCI', 'SCIE')),])
summary(lm_g)
##
## Call:
## lm(formula = quantile_jif ~ quantile_jci, data = df_rank[which(df_rank$DB %in%
## c("SSCI", "SCIE")), ])
##
## Residuals:
## Min 1Q Median 3Q Max
## -206.610 -6.640 -1.233 6.139 211.112
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 3.651088 0.213643 17.09 <2e-16 ***
## quantile_jci 0.947728 0.002255 420.36 <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 20.43 on 20030 degrees of freedom
## Multiple R-squared: 0.8982, Adjusted R-squared: 0.8982
## F-statistic: 1.767e+05 on 1 and 20030 DF, p-value: < 2.2e-16
The models obtained for SSCI and SCIE journals have a good R2.
# One model per database: JIF rank as a function of JCI rank.
lm_ssci <- lm(
  formula = quantile_jif ~ quantile_jci,
  data = df_rank[which(df_rank$DB == 'SSCI'), ]
)
summary(lm_ssci)$r.squared
## [1] 0.8851921
lm_scie <- lm(
  formula = quantile_jif ~ quantile_jci,
  data = df_rank[which(df_rank$DB == 'SCIE'), ]
)
summary(lm_scie)$r.squared
## [1] 0.9029314
# Plotmath label with the fitted equation and the R-squared of a model.
eq_label <- function(model) {
  betas <- coef(model)
  paste0(
    'y==', format(betas[1], digits = 2),
    '+', format(betas[2], digits = 2),
    '~x~', '~~italic(r)^2==', round(summary(model)$r.squared, 3), ''
  )
}
# One label per facet, anchored at the right edge (x = Inf) at rank 10.
df_r2 <- data.frame(
  quantile_jci = c(Inf, Inf),
  quantile_jif = c(10, 10),
  text = c(eq_label(lm_ssci), eq_label(lm_scie)),
  DB = c('SSCI', 'SCIE')
)
# Scatter plot of JIF rank against JCI rank with the fitted line, per database.
ggplot(
  data = df_rank[which(df_rank$DB %in% c('SSCI', 'SCIE')), ],
  aes(x = quantile_jci, y = quantile_jif)
) +
  geom_point(color = '#1f77b4', size = 1, alpha = 0.75, stroke = 0) +
  geom_smooth(formula = 'y ~ x', method = 'lm', color = 'black') +
  geom_text(
    data = df_r2,
    aes(x = quantile_jci, y = quantile_jif, label = text, group = DB),
    parse = TRUE, hjust = 1.05
  ) +
  theme_light() +
  labs(x = 'JCI rank', y = 'JIF rank') +
  facet_wrap(. ~ DB, scales = 'free') +
  theme(
    text = element_text(family = 'Arial', size = 12.5, color = 'black'),
    axis.text = element_text(color = 'black', size = 11),
    axis.ticks = element_line(color = 'black'),
    panel.border = element_rect(color = 'black'),
    strip.background = element_rect(color = 'black', fill = 'black'),
    strip.text = element_text(size = 12, color = 'white'),
    legend.position = 'none'
  )
The average R2 value of the linear regression models is calculated.
# One row per SSCI/SCIE category: the R-squared of its rank model and its DB.
indexed_rows <- which(df_rank$DB %in% c('SSCI', 'SCIE'))
df_r <- data.frame(
  category = unique(df_rank$Category[indexed_rows]),
  r2 = NA,
  DB = NA,
  stringsAsFactors = FALSE
)
for (ca in unique(df_r$category)) {
  target <- which(df_r$category == ca)
  # Rows of this category that report both indicators.
  fit_rows <- which(
    df_rank$Category == ca &
      !is.na(df_rank$`2020 JIF`) &
      !is.na(df_rank$`2020 JCI`)
  )
  lm_aux <- lm(formula = quantile_jci ~ quantile_jif, data = df_rank[fit_rows, ])
  df_r[target, 'r2'] <- summary(lm_aux)$r.squared
  df_r[target, 'DB'] <- unique(df_rank[which(df_rank$Category == ca), 'DB'])
}
# Average R-squared: overall, then per database.
mean(df_r$r2, na.rm = TRUE)
## [1] 0.8064401
mean(df_r$r2[which(df_r$DB == 'SSCI')], na.rm = TRUE)
## [1] 0.7380394
mean(df_r$r2[which(df_r$DB == 'SCIE')], na.rm = TRUE)
## [1] 0.8287279
As a specific case study, the linear regression model for Information Science & Library Science is generated.
# All rows of the Information Science & Library Science category.
df_lib <- df[which(df$Cat == 'INFORMATION SCIENCE & LIBRARY SCIENCE'), ]
# Training set: rows that report both the JIF and the JCI.
has_both <- !is.na(df_lib$`2020 JIF`) & !is.na(df_lib$`2020 JCI`)
df_lib_train <- df_lib[which(has_both), ]
# Descending ranks (1 = highest value); ties keep their order of appearance.
df_lib_train$quantile_jif <- trunc(
  rank(-df_lib_train[['2020 JIF']], ties.method = 'first')
)
df_lib_train$quantile_jci <- trunc(
  rank(-df_lib_train[['2020 JCI']], ties.method = 'first')
)
lib_lm <- lm(formula = quantile_jif ~ quantile_jci, data = df_lib_train)
JIF ranking is predicted and compared to the original JIF ranking.
# Prediction set: every row of the category that reports a JCI.
df_lib_predict <- df_lib[which(!is.na(df_lib$`2020 JCI`)), ]
df_lib_predict$quantile_jci <- trunc(
  rank(-df_lib_predict[['2020 JCI']], ties.method = 'first')
)
df_lib_predict$quantile_jif <- trunc(
  rank(-df_lib_predict[['2020 JIF']], ties.method = 'first')
)
# rank() still assigns a position to missing values; blank those out.
df_lib_predict$quantile_jif[which(is.na(df_lib_predict$`2020 JIF`))] <- NA
# Predict the JIF rank from the JCI rank, then turn the predictions into a
# ranking (ties broken at random).
predicted_rank <- predict(lib_lm, newdata = df_lib_predict)
df_lib_predict$quantile_jci_p <- rank(as.numeric(predicted_rank), ties.method = 'random')
# Quartile boundaries of the ranking, used for the guide lines and Q labels.
n_lib <- nrow(df_lib_predict)
quartile_cuts <- n_lib * (1:3) / 4
# Actual and predicted JIF positions on two rows, joined by one segment per
# row of the prediction set.
ggplot() +
  geom_point(
    data = df_lib_predict,
    aes(x = 'JIF', y = quantile_jif, color = DB),
    alpha = 0.7
  ) +
  geom_point(
    data = df_lib_predict,
    aes(x = 'Predicted JIF', y = quantile_jci_p, color = DB),
    alpha = 0.7
  ) +
  geom_segment(
    data = df_lib_predict,
    aes(x = 'Predicted JIF', xend = 'JIF', y = quantile_jci_p, yend = quantile_jif),
    inherit.aes = FALSE,
    color = 'grey20',
    size = 0.25
  ) +
  geom_text(
    aes(
      y = c(1, quartile_cuts),
      x = Inf,
      label = c('Q1', 'Q2', 'Q3', 'Q4')
    ),
    hjust = 1.2, vjust = 1
  ) +
  scale_color_manual(
    values = c('#fc4e08', '#28b7c1'),
    labels = c('ESCI journals', 'SSCI journals')
  ) +
  geom_hline(yintercept = quartile_cuts, size = 0.25) +
  coord_flip() +
  scale_x_discrete(limits = rev) +
  scale_y_continuous(trans = "reverse", breaks = c(1, 50, 100, 150)) +
  labs(title = '', y = 'Ranking position', x = '') +
  theme_classic() +
  theme(
    text = element_text(family = 'Arial', size = 12.5, color = 'black'),
    axis.text = element_text(color = 'black', size = 11),
    axis.ticks = element_line(color = 'black'),
    legend.position = 'bottom',
    legend.title = element_blank()
  )